#!/usr/bin/env python3
"""
Example: Using Video Input Mode with Cooperative Command Game

Demonstrates how to use the new video input functionality to provide
actual video content to multimodal models instead of individual frames.
"""

import numpy as np
import base64
from qwen_eval_simple import MultiProviderEvaluator

def _sniff_media_kind(payload):
    """Classify decoded media bytes by their leading signature.

    Args:
        payload: Raw (already base64-decoded) bytes.

    Returns:
        "video" when the bytes look like an MP4-style container,
        "image" when they start with the JPEG signature, otherwise "unknown".
    """
    # The `ftyp` tag sits at byte offset 4, right after the 4-byte box size.
    # Checking only the 4-byte tag covers every brand (isom, mp42, ...), so
    # no per-brand comparisons are needed.
    if payload[4:8] == b'ftyp':
        return "video"
    # Looser fallback: a small leading box size followed by `ftyp` somewhere
    # in the first 20 bytes.
    if payload.startswith(b'\x00\x00\x00') and b'ftyp' in payload[:20]:
        return "video"
    if payload.startswith(b'\xff\xd8\xff'):
        return "image"
    return "unknown"


def run_video_input_example():
    """Run a short evaluation using video input mode.

    Builds a ``MultiProviderEvaluator`` with ``input_mode="video"``, resets
    its environment and takes up to five scripted steps (no model is
    queried). Before each step the ``'video'`` entry of the observation, if
    any, is base64-decoded and reported as MP4, JPEG or unknown. The loop
    stops as soon as the environment reports the episode as terminated or
    truncated. The environment is always closed, even if a step raises, and
    afterwards any ``*.mp4`` files found in ``evaluator.videos_dir`` are
    listed when ``save_media`` is enabled.
    """
    print("🎬 Video Input Mode Example")
    print("=" * 50)

    # Create evaluator with video input mode
    evaluator = MultiProviderEvaluator(
        difficulty="normal",
        seed_index=0,
        max_rounds=15,
        api_provider="qwen",  # Use your preferred provider
        input_mode="video",   # 🎥 This is the key change!
        save_media=True,
        deterministic_commands=True
    )
    env = evaluator.env

    # Everything that touches the environment runs under try/finally so the
    # environment is released even when reset()/step() fails.
    try:
        print("📋 Configuration:")
        print(f"  - Input mode: {evaluator.input_mode}")
        print(f"  - Recording mode: {env.recording_mode}")
        print(f"  - Video input mode: {env.video_input_mode}")
        print(f"  - Frame buffer size: {env.max_buffer_frames}")

        # Run a few steps to demonstrate video input
        print("\n🚀 Starting evaluation with video inputs...")

        obs, info = env.reset()
        print(f"Initial observation keys: {list(obs.keys()) if isinstance(obs, dict) else 'vector only'}")

        for step in range(5):
            print(f"\n--- Step {step + 1} ---")

            # Check what type of video data we're getting
            if isinstance(obs, dict) and 'video' in obs:
                try:
                    video_data = base64.b64decode(obs['video'])
                except (ValueError, TypeError) as e:
                    # binascii.Error (bad padding) is a ValueError; TypeError
                    # covers a payload that is neither str nor bytes-like.
                    print(f"  ❌ Error analyzing video data: {e}")
                else:
                    data_size = len(video_data)
                    frames = getattr(env, 'frame_buffer', None)
                    buffer_size = len(frames) if frames is not None else 0

                    kind = _sniff_media_kind(video_data)
                    if kind == "video":
                        print(f"  📹 Sending MP4 video to model ({data_size:,} bytes from {buffer_size} frames)")
                    elif kind == "image":
                        print(f"  📸 Sending JPEG frame to model ({data_size:,} bytes)")
                    else:
                        print(f"  ❓ Unknown content type ({data_size:,} bytes)")
            else:
                print("  ⚠️ No video data in observation")

            # For demonstration, just use a simple action instead of querying the model
            # In real usage, you would pass obs to evaluator._query_model()
            action = [step % env.num_members, step % 5, 30 + step*10, 40 + step*5]

            # Execute action
            obs, reward, terminated, truncated, info = env.step(action)

            print(f"  🎯 Action: {action}")
            print(f"  🏆 Reward: {reward:.2f}")
            print(f"  📊 Score: {info.get('score_normalized', 0):.1f}/100")

            # With a Gymnasium-style 5-tuple, either flag means the episode
            # is over; stepping again without a reset is not valid.
            if terminated:
                print("  🏁 Episode terminated")
                break
            if truncated:
                print("  ⏱️ Episode truncated")
                break
    finally:
        # Clean up
        env.close()

    print("\n✅ Video input example completed!")
    print(f"📁 Media files saved to: {evaluator.output_dir}")

    # Show what video files were created
    if evaluator.save_media:
        video_files = list(evaluator.videos_dir.glob("*.mp4"))
        if video_files:
            print("\n🎥 Video files created:")
            for video_file in video_files:
                print(f"  - {video_file.name}")
        else:
            print(f"\n⚠️ No video files found in {evaluator.videos_dir}")

def compare_input_modes():
    """Compare different input modes side by side.

    For each of ``"image_audio"`` and ``"video"`` a fresh
    ``MultiProviderEvaluator`` is created (media saving disabled), the
    environment is reset and stepped up to three times with a fixed action,
    and the structure of the last observation is printed: its keys, the
    detected format of any ``'video'`` payload, the shape of any ``'image'``
    entry and the number of events in any ``'audio'`` entry. Each
    environment is closed even if inspection fails.
    """
    import json  # local import: json is not imported at module level

    print("\n🔄 Comparing Input Modes")
    print("=" * 50)

    input_modes = ["image_audio", "video"]

    for mode in input_modes:
        print(f"\n🎯 Testing {mode} mode:")

        evaluator = MultiProviderEvaluator(
            difficulty="normal",
            seed_index=1,
            max_rounds=10,
            api_provider="qwen",
            input_mode=mode,
            save_media=False,  # Disable saving for quick test
            deterministic_commands=True
        )

        try:
            obs, info = evaluator.env.reset()

            # Take a few steps to build up frame buffer
            for _ in range(3):
                action = [0, 0, 50, 50]
                obs, reward, terminated, truncated, info = evaluator.env.step(action)
                # Do not step an episode that has already ended.
                if terminated or truncated:
                    break

            # Check observation structure
            if isinstance(obs, dict):
                print(f"  Observation keys: {list(obs.keys())}")

                if 'video' in obs:
                    try:
                        video_data = base64.b64decode(obs['video'])
                    except (ValueError, TypeError):
                        # binascii.Error is a ValueError; TypeError covers
                        # payloads that are neither str nor bytes-like.
                        print("  ❌ Video content: Could not decode")
                    else:
                        size = len(video_data)
                        # `ftyp` at offset 4 marks an MP4-style container
                        # regardless of the brand that follows it.
                        if video_data[4:8] == b'ftyp':
                            print(f"  ✅ Video content: MP4 ({size:,} bytes)")
                        elif video_data.startswith(b'\xff\xd8\xff'):
                            print(f"  📸 Video content: JPEG frame ({size:,} bytes)")
                        else:
                            # Previously anything that was not MP4 was
                            # reported as JPEG without being checked.
                            print(f"  ❓ Video content: unknown format ({size:,} bytes)")

                if 'image' in obs:
                    image_shape = getattr(obs['image'], 'shape', 'unknown')
                    print(f"  📸 Image: {image_shape}")

                if 'audio' in obs:
                    try:
                        event_count = len(json.loads(obs['audio']))
                    except (ValueError, TypeError):
                        # Not JSON (JSONDecodeError is a ValueError) or not a
                        # sized value: just report that audio is there.
                        print("  🔊 Audio: present")
                    else:
                        print(f"  🔊 Audio: {event_count} events")
            else:
                print("  Vector observation only")
        finally:
            evaluator.env.close()

if __name__ == "__main__":
    # Script entry point: banner, both demos in order, then closing notes.
    banner = "🎥 Video Input Mode Examples"
    print(banner)
    print("=" * 60)

    run_video_input_example()
    compare_input_modes()

    # Closing notes are emitted one per line, in this exact order.
    closing_notes = (
        "\n🎯 Summary:",
        "- Use input_mode='video' to send actual MP4 videos to models",
        "- Use input_mode='image_audio' for traditional image+audio inputs",
        "- Video mode buffers recent frames and creates short video clips",
        "- Videos are more informative for understanding temporal changes",
        "- Video files are automatically saved with proper .mp4 extension",
        "\n📖 Next steps:",
        "- Update your evaluation scripts to use input_mode='video'",
        "- Test with your multimodal models that support video input",
        "- Adjust video_fps and buffer size based on your needs",
    )
    print(*closing_notes, sep="\n")